/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package mj.ocraptor.extraction.tika.parser.microsoft;

import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.imageio.ImageIO;

import mj.ocraptor.configuration.Config;
import mj.ocraptor.configuration.properties.ConfigBool;
import mj.ocraptor.extraction.image_processing.TikaImageHelper;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Field;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

public class WordExtractor extends AbstractPOIFSExtractor {

  // Unicode replacement written out for the non-breaking hyphen that the
  // document text carries as char 30 (see handleCharacterRun).
  private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
  // Unicode replacement written out for the non-required hyphen that the
  // document text carries as char 31 (see handleCharacterRun).
  private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
  // Metadata supplied by the caller; handed to TikaImageHelper when the
  // embedded images are OCR-ed.
  private Metadata metadata;

  /**
   * Creates an extractor for a single Word document.
   *
   * @param context
   *          parse context, forwarded to the superclass
   * @param metadata
   *          metadata object kept for the image OCR helper
   */
  public WordExtractor(ParseContext context, Metadata metadata) {
    super(context);
    this.metadata = metadata;
  }

  // True if we are currently in the named style tag. These flags are shared
  // across character runs and reset at the end of every paragraph:
  private boolean curStrikeThrough;
  private boolean curBold;
  private boolean curItalic;

  /**
   * Parses the Word document stored in the given POIFS file system by
   * delegating to {@link #parse(DirectoryNode, XHTMLContentHandler)} with the
   * file system's root directory.
   *
   * @param filesystem
   *          file system holding the document
   * @param xhtml
   *          handler receiving the extracted content
   */
  protected void parse(NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
      throws IOException, SAXException, TikaException {
    DirectoryNode rootNode = filesystem.getRoot();
    parse(rootNode, xhtml);
  }

  /**
   * OCRaptor addition (mj): runs every picture of the document through the
   * image helper and appends the recognised text to the output.
   * <p>
   * Does nothing unless {@code ConfigBool.ENABLE_IMAGE_OCR} is switched on.
   * The step is best-effort: a failure is reported on stderr and never
   * propagated, so text extraction of the document itself can continue.
   *
   * @param xhtml
   *          handler the recognised text is added to
   * @param document
   *          document whose picture table is scanned
   */
  private void extractImageText(XHTMLContentHandler xhtml, HWPFDocument document) {
    if (!Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
      return;
    }

    TikaImageHelper helper = new TikaImageHelper(metadata);
    try {
      for (Picture picture : document.getPicturesTable().getAllPictures()) {
        byte[] content = picture.getContent();
        if (content == null || content.length == 0) {
          // Nothing to decode for this picture
          continue;
        }
        // ImageIO.read returns null when no registered reader understands
        // the data (e.g. WMF/EMF pictures); such pictures are skipped
        // instead of handing a null image to the helper.
        java.awt.image.BufferedImage image = ImageIO
            .read(new ByteArrayInputStream(content));
        if (image != null) {
          helper.addImage(image);
        }
      }
      // TODO: find out page number
      helper.addTextToHandler(xhtml);
    } catch (Exception e) {
      // Best-effort: OCR problems must not abort the document parse
      e.printStackTrace();
    } finally {
      helper.close();
    }
  }

  /**
   * Parses the Word document rooted at the given directory node and writes
   * its content to the handler.
   * <p>
   * Output order: OCR text of the pictures (if enabled), headers, main body
   * paragraphs, text boxes, footnotes, comments, endnotes, footers, pictures
   * not yet emitted, and finally embedded office documents found in the
   * "ObjectPool" directory. Documents rejected by POI with an
   * {@link OldWordFileFormatException} are handed to
   * {@link #parseWord6(DirectoryNode, XHTMLContentHandler)} instead.
   *
   * @param root
   *          root directory of the document
   * @param xhtml
   *          handler receiving the extracted content
   */
  protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
      throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
      document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
      // Pre-Word-97 file: use the Word 6 code path
      parseWord6(root, xhtml);
      return;
    }

    org.apache.poi.hwpf.extractor.WordExtractor poiExtractor =
        new org.apache.poi.hwpf.extractor.WordExtractor(document);

    // mj
    extractImageText(xhtml, document);

    HeaderStories stories = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    // the pictures should be in order, and may be directly
    // placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Headers first, if there are any
    Range[] headerRanges = new Range[] {
        stories.getFirstHeaderSubrange(),
        stories.getEvenHeaderSubrange(),
        stories.getOddHeaderSubrange() };
    handleHeaderFooter(headerRanges, "header", document, pictures,
        pictureTable, xhtml);

    // Main body text; handleParagraph reports how many extra paragraphs
    // (the rest of a table) it has already consumed
    Range body = document.getRange();
    int paragraphIndex = 0;
    while (paragraphIndex < body.numParagraphs()) {
      Paragraph paragraph = body.getParagraph(paragraphIndex);
      int alreadyConsumed = handleParagraph(paragraph, 0, body, document,
          FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml);
      paragraphIndex += alreadyConsumed + 1;
    }

    // Remaining text parts, each paragraph as a plain <p>
    for (String textboxText : poiExtractor.getMainTextboxText()) {
      xhtml.element("p", textboxText);
    }

    for (String footnoteText : poiExtractor.getFootnoteText()) {
      xhtml.element("p", footnoteText);
    }

    for (String commentText : poiExtractor.getCommentsText()) {
      xhtml.element("p", commentText);
    }

    for (String endnoteText : poiExtractor.getEndnoteText()) {
      xhtml.element("p", endnoteText);
    }

    // Footers, if there are any
    Range[] footerRanges = new Range[] {
        stories.getFirstFooterSubrange(),
        stories.getEvenFooterSubrange(),
        stories.getOddFooterSubrange() };
    handleHeaderFooter(footerRanges, "footer", document, pictures,
        pictureTable, xhtml);

    // Emit every picture that has not been claimed so far
    Picture unclaimed = pictures.nextUnclaimed();
    while (unclaimed != null) {
      handlePictureCharacterRun(null, unclaimed, pictures, xhtml);
      unclaimed = pictures.nextUnclaimed();
    }

    // Handle any embedded office documents
    try {
      DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
      for (Entry entry : objectPool) {
        boolean underscoreName = entry.getName().startsWith("_");
        if (underscoreName && entry instanceof DirectoryEntry) {
          handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
        }
      }
    } catch (FileNotFoundException e) {
      // No "ObjectPool" entry: the document has no embedded objects
    }
  }

  /**
   * Sums the paragraph counts of all non-null ranges.
   *
   * @param ranges
   *          ranges to inspect; individual entries may be null
   * @return total number of paragraphs over the non-null ranges
   */
  private static int countParagraphs(Range... ranges) {
    int total = 0;
    for (int idx = 0; idx < ranges.length; idx++) {
      Range candidate = ranges[idx];
      if (candidate == null) {
        continue;
      }
      total += candidate.numParagraphs();
    }
    return total;
  }

  /**
   * Writes the given header or footer ranges inside one
   * {@code <div class="type">} element. Nothing is written when the ranges
   * hold no paragraphs at all; paragraphs consisting only of whitespace are
   * skipped.
   *
   * @param ranges
   *          header or footer ranges; entries may be null
   * @param type
   *          value of the div's class attribute
   */
  private void handleHeaderFooter(Range[] ranges, String type,
      HWPFDocument document, PicturesSource pictures,
      PicturesTable pictureTable, XHTMLContentHandler xhtml)
      throws SAXException, IOException, TikaException {
    if (countParagraphs(ranges) <= 0) {
      return;
    }

    xhtml.startElement("div", "class", type);
    for (Range range : ranges) {
      if (range == null) {
        continue;
      }
      int idx = 0;
      while (idx < range.numParagraphs()) {
        Paragraph paragraph = range.getParagraph(idx);
        String withoutWhitespace = paragraph.text().replaceAll(
            "[\\r\\n\\s]+", "");
        // Empty header or footer paragraphs are not emitted
        if (!withoutWhitespace.isEmpty()) {
          idx += handleParagraph(paragraph, 0, range, document,
              FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
        }
        idx++;
      }
    }
    xhtml.endElement("div");
  }

  /**
   * Writes one paragraph to the handler. When the paragraph starts a
   * top-level table, the whole table is written instead (rows, cells and the
   * paragraphs inside each cell).
   * <p>
   * For ordinary paragraphs the tag and class are derived from the paragraph
   * style name; each character run is then dispatched to the field, picture
   * or plain-text handling. Style tags still open at the end of the paragraph
   * are closed before the paragraph element is.
   *
   * @param p
   *          paragraph to write
   * @param parentTableLevel
   *          table level of the caller, 0 outside of tables
   * @param r
   *          range the paragraph belongs to
   * @param document
   *          document providing style sheet and fields
   * @param docPart
   *          document part used to look up fields by offset
   * @param pictures
   *          picture bookkeeping
   * @param pictureTable
   *          picture table used to detect inline pictures
   * @param xhtml
   *          handler receiving the output
   * @return number of additional paragraphs of {@code r} that were consumed:
   *         the table's paragraph count minus one when a table was written,
   *         otherwise 0
   */
  private int handleParagraph(Paragraph p, int parentTableLevel, Range r,
      HWPFDocument document, FieldsDocumentPart docPart,
      PicturesSource pictures, PicturesTable pictureTable,
      XHTMLContentHandler xhtml) throws SAXException, IOException,
      TikaException {
    // Note - a poi bug means we can't currently properly recurse
    // into nested tables, so currently we don't
    if (p.isInTable() && p.getTableLevel() > parentTableLevel
        && parentTableLevel == 0) {
      Table t = r.getTable(p);
      xhtml.startElement("table");
      xhtml.startElement("tbody");
      for (int rn = 0; rn < t.numRows(); rn++) {
        TableRow row = t.getRow(rn);
        xhtml.startElement("tr");
        for (int cn = 0; cn < row.numCells(); cn++) {
          TableCell cell = row.getCell(cn);
          xhtml.startElement("td");

          for (int pn = 0; pn < cell.numParagraphs(); pn++) {
            Paragraph cellP = cell.getParagraph(pn);
            handleParagraph(cellP, p.getTableLevel(), cell, document, docPart,
                pictures, pictureTable, xhtml);
          }
          xhtml.endElement("td");
        }
        xhtml.endElement("tr");
      }
      xhtml.endElement("tbody");
      xhtml.endElement("table");
      // The caller has to skip the remaining paragraphs of the table
      return (t.numParagraphs() - 1);
    }

    // Work out tag and class from the paragraph style; plain <p> when the
    // style is missing or unnamed
    TagAndStyle tas = null;
    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
      StyleDescription style = document.getStyleSheet().getStyleDescription(
          p.getStyleIndex());
      if (style != null && style.getName() != null
          && style.getName().length() > 0) {
        tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
      }
    }
    if (tas == null) {
      tas = new TagAndStyle("p", null);
    }

    if (tas.getStyleClass() != null) {
      xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
      xhtml.startElement(tas.getTag());
    }

    for (int j = 0; j < p.numCharacterRuns(); j++) {
      CharacterRun cr = p.getCharacterRun(j);
      String runText = cr.text();

      // FIELD_BEGIN_MARK: compare the character itself, so an empty run
      // cannot cause an index error and the platform charset is irrelevant
      if (runText.length() > 0 && runText.charAt(0) == 0x13) {
        Field field = document.getFields().getFieldByStartOffset(docPart,
            cr.getStartOffset());
        // 58 is an embedded document
        // 56 is a document link
        if (field != null && (field.getType() == 58 || field.getType() == 56)) {
          // Embedded Object: add a <div
          // class="embedded" id="_X"/> so consumer can see where
          // in the main text each embedded document
          // occurred:
          String id = "_unknown_id";
          // The separator run may be missing; keep the placeholder id then
          CharacterRun separator = field.getMarkSeparatorCharacterRun(r);
          if (separator != null) {
            id = "_" + separator.getPicOffset();
          }
          AttributesImpl attributes = new AttributesImpl();
          attributes.addAttribute("", "class", "class", "CDATA", "embedded");
          attributes.addAttribute("", "id", "id", "CDATA", id);
          xhtml.startElement("div", attributes);
          xhtml.endElement("div");
        }
      }

      if (runText.equals("\u0013")) {
        // Field: the helper reports how many runs it consumed
        j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
      } else if (runText.startsWith("\u0008")) {
        // Floating Picture(s), one per character of the run
        for (int pn = 0; pn < runText.length(); pn++) {
          // Assume they're in the order from the unclaimed list...
          Picture picture = pictures.nextUnclaimed();

          // Output
          handlePictureCharacterRun(cr, picture, pictures, xhtml);
        }
      } else if (pictureTable.hasPicture(cr)) {
        // Inline Picture
        Picture picture = pictures.getFor(cr);
        handlePictureCharacterRun(cr, picture, pictures, xhtml);
      } else {
        handleCharacterRun(cr, tas.isHeading(), xhtml);
      }
    }

    // Close any still open style tags, innermost first
    if (curStrikeThrough) {
      xhtml.endElement("s");
      curStrikeThrough = false;
    }
    if (curItalic) {
      xhtml.endElement("i");
      curItalic = false;
    }
    if (curBold) {
      xhtml.endElement("b");
      curBold = false;
    }

    xhtml.endElement(tas.getTag());

    return 0;
  }

  /**
   * Writes the text of one character run, opening and closing {@code <b>},
   * {@code <i>} and {@code <s>} elements as the run's formatting differs from
   * the currently open tags.
   * <p>
   * Tags are kept properly nested in the order b, i, s: a change of an outer
   * style first closes the inner ones, which are re-opened afterwards if the
   * run still carries them. Runs marked as deleted and runs consisting of a
   * single carriage return are not written.
   *
   * @param cr
   *          character run to write
   * @param skipStyling
   *          true to write the text without any style tags
   * @param xhtml
   *          handler receiving the output
   */
  private void handleCharacterRun(CharacterRun cr, boolean skipStyling,
      XHTMLContentHandler xhtml) throws SAXException {
    // Skip trailing newlines
    if (!isRendered(cr) || cr.text().equals("\r")) {
      return;
    }

    if (!skipStyling) {
      final boolean bold = cr.isBold();
      final boolean italic = cr.isItalic();
      final boolean strikeThrough = cr.isStrikeThrough();

      if (bold != curBold) {
        // <b> is the outermost tag: <s> and <i> have to be closed first
        if (curStrikeThrough) {
          xhtml.endElement("s");
          curStrikeThrough = false;
        }
        if (curItalic) {
          xhtml.endElement("i");
          curItalic = false;
        }
        if (bold) {
          xhtml.startElement("b");
        } else {
          xhtml.endElement("b");
        }
        curBold = bold;
      }

      if (italic != curItalic) {
        // <i> encloses <s>: close <s> first
        if (curStrikeThrough) {
          xhtml.endElement("s");
          curStrikeThrough = false;
        }
        if (italic) {
          xhtml.startElement("i");
        } else {
          xhtml.endElement("i");
        }
        curItalic = italic;
      }

      if (strikeThrough != curStrikeThrough) {
        if (strikeThrough) {
          xhtml.startElement("s");
        } else {
          xhtml.endElement("s");
        }
        curStrikeThrough = strikeThrough;
      }
    }

    // Normalise the text: carriage returns become newlines ...
    String cleaned = cr.text().replace('\r', '\n');
    // ... and a trailing table cell end marker is dropped
    if (cleaned.endsWith("\u0007")) {
      cleaned = cleaned.substring(0, cleaned.length() - 1);
    }

    // Copied from POI's
    // org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
    // line tabulator -> line break, char 30 -> non-breaking hyphen,
    // char 31 (non-required hyphen) -> zero-width space
    cleaned = cleaned.replace((char) 0x000b, '\n')
        .replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN)
        .replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);

    // TODO: mj
    xhtml.characters(cleaned);
  }

  /**
   * Handles a field, i.e. a sequence of character runs that can be
   * \13..text..\15 or \13..control..\14..text..\15 . Nesting is allowed.
   * <p>
   * Runs before the \14 separator are collected as control runs, runs after
   * it as text runs; without a separator all runs count as text. A control
   * string starting with "HYPERLINK" that holds a quoted target is written as
   * an {@code <a href>} element around the text runs. Anything else,
   * including a HYPERLINK whose target is not enclosed in a pair of double
   * quotes, is written as plain text runs (with pictures resolved).
   *
   * @param p
   *          paragraph holding the runs
   * @param index
   *          index of the run that opens the field
   * @param skipStyling
   *          true to write text without style tags
   * @param pictures
   *          picture bookkeeping
   * @param xhtml
   *          handler receiving the output
   * @return number of runs after {@code index} the caller has to skip
   */
  private int handleSpecialCharacterRuns(Paragraph p, int index,
      boolean skipStyling, PicturesSource pictures, XHTMLContentHandler xhtml)
      throws SAXException, TikaException, IOException {
    List<CharacterRun> controls = new ArrayList<CharacterRun>();
    List<CharacterRun> texts = new ArrayList<CharacterRun>();
    boolean has14 = false;

    // Split it into before and after the 14
    int i;
    for (i = index + 1; i < p.numCharacterRuns(); i++) {
      CharacterRun cr = p.getCharacterRun(i);
      if (cr.text().equals("\u0013")) {
        // Nested, oh joy...
        int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling,
            pictures, xhtml);
        i += increment;
      } else if (cr.text().equals("\u0014")) {
        has14 = true;
      } else if (cr.text().equals("\u0015")) {
        if (!has14) {
          // No separator seen: everything collected so far is text
          texts = controls;
          controls = new ArrayList<CharacterRun>();
        }
        break;
      } else {
        if (has14) {
          texts.add(cr);
        } else {
          controls.add(cr);
        }
      }
    }

    // Do we need to do something special with this?
    boolean writtenAsLink = false;
    if (controls.size() > 0) {
      StringBuilder controlText = new StringBuilder();
      for (CharacterRun control : controls) {
        controlText.append(control.text());
      }
      String text = controlText.toString();

      // A link target needs an opening AND a closing quote; requiring the
      // last quote to lie behind the first one keeps substring() in bounds
      int openQuote = text.indexOf('"');
      int closeQuote = text.lastIndexOf('"');
      if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK"))
          && closeQuote > openQuote) {
        String url = text.substring(openQuote + 1, closeQuote);
        xhtml.startElement("a", "href", url);
        for (CharacterRun cr : texts) {
          handleCharacterRun(cr, skipStyling, xhtml);
        }
        xhtml.endElement("a");
        writtenAsLink = true;
      }
    }

    if (!writtenAsLink) {
      if (controls.size() > 0) {
        // Just output the text ones
        for (CharacterRun cr : texts) {
          if (pictures.hasPicture(cr)) {
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
          } else {
            handleCharacterRun(cr, skipStyling, xhtml);
          }
        }
      } else {
        // We only had text
        // Output as-is
        for (CharacterRun cr : texts) {
          handleCharacterRun(cr, skipStyling, xhtml);
        }
      }
    }

    // Tell them how many to skip over
    return i - index;
  }

  /**
   * Writes an {@code <img>} element for the picture and, the first time a
   * given picture is seen, passes its content on as an embedded resource.
   * <p>
   * Nothing is written when the run is marked as deleted or when no picture
   * is available.
   *
   * @param cr
   *          run referencing the picture; may be null
   * @param picture
   *          picture to write; may be null
   * @param pictures
   *          picture bookkeeping (numbering and "already output" tracking)
   * @param xhtml
   *          handler receiving the output
   */
  private void handlePictureCharacterRun(CharacterRun cr, Picture picture,
      PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException,
      IOException, TikaException {
    if (!isRendered(cr)) {
      return;
    }
    if (picture == null) {
      // Oh dear, we've run out...
      // Probably caused by multiple floating-picture markers referencing
      // the same real image
      return;
    }

    // The file holds no picture name, so one is made up from the picture's
    // position; it links the img tag with the embedded resource
    String extension = picture.suggestFileExtension();
    int number = pictures.pictureNumber(picture);
    StringBuilder nameBuilder = new StringBuilder("image").append(number);
    if (extension.length() > 0) {
      nameBuilder.append(".").append(extension);
    }
    String filename = nameBuilder.toString();

    String mimeType = picture.getMimeType();

    // The img tag is written for every reference to the picture
    AttributesImpl imgAttributes = new AttributesImpl();
    imgAttributes.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
    imgAttributes.addAttribute("", "alt", "alt", "CDATA", filename);
    xhtml.startElement("img", imgAttributes);
    xhtml.endElement("img");

    // The picture content itself is only exposed once
    if (pictures.hasOutput(picture)) {
      return;
    }
    TikaInputStream stream = TikaInputStream.get(picture.getContent());
    handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
    pictures.recordOutput(picture);
  }

  /**
   * Emits {@code text} as a single paragraph wrapped in a
   * {@code <div class="section">}; null or empty text produces no output.
   *
   * @param xhtml
   *          XHTML content handler
   * @param section
   *          the class of the &lt;div/&gt; section emitted
   * @param text
   *          text to be emitted, if any
   * @throws SAXException
   *           if an error occurs
   */
  private void addTextIfAny(XHTMLContentHandler xhtml, String section,
      String text) throws SAXException {
    if (text == null || text.length() == 0) {
      return;
    }
    xhtml.startElement("div", "class", section);
    xhtml.element("p", text);
    xhtml.endElement("div");
  }

  /**
   * Parses an old-format (Word 6) document stored in the given POIFS file
   * system by delegating to
   * {@link #parseWord6(DirectoryNode, XHTMLContentHandler)} with the file
   * system's root directory.
   *
   * @param filesystem
   *          file system holding the document
   * @param xhtml
   *          handler receiving the extracted content
   */
  protected void parseWord6(NPOIFSFileSystem filesystem,
      XHTMLContentHandler xhtml) throws IOException, SAXException,
      TikaException {
    DirectoryNode rootNode = filesystem.getRoot();
    parseWord6(rootNode, xhtml);
  }

  /**
   * Parses an old-format (Word 6) document rooted at the given directory
   * node and writes each paragraph returned by POI's {@link Word6Extractor}
   * as a {@code <p>} element.
   *
   * @param root
   *          root directory of the document
   * @param xhtml
   *          handler receiving the extracted content
   */
  protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml)
      throws IOException, SAXException, TikaException {
    Word6Extractor legacyExtractor = new Word6Extractor(
        new HWPFOldDocument(root));

    for (String paragraphText : legacyExtractor.getParagraphText()) {
      xhtml.element("p", paragraphText);
    }
  }

  // Style names that map to a fixed tag/class pair; consulted first by
  // buildParagraphTagAndStyle. Keys are exact, case-sensitive style names.
  private static final Map<String, TagAndStyle> fixedParagraphStyles = new HashMap<String, TagAndStyle>();
  // Plain <p> without a class; shared by the "Default" and "Normal" entries
  // and returned for "Table Contents" paragraphs inside tables.
  private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p",
      null);
  static {
    fixedParagraphStyles.put("Default", defaultParagraphStyle);
    fixedParagraphStyles.put("Normal", defaultParagraphStyle);
    // Heading style names without a trailing level number become <h1>
    fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
    fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
    fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
    fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle"));
    fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
  }

  /**
   * Given a style name, return what tag should be used, and what style should
   * be applied to it.
   * <ul>
   * <li>A null or empty name yields the default paragraph style.</li>
   * <li>Names listed in the fixed style table yield their fixed entry.</li>
   * <li>"Table Contents" inside a table yields the default paragraph
   * style.</li>
   * <li>Names starting with "heading" or "Heading" yield h1 to h6; the level
   * is the number the name ends in (1 if there is none), limited to the range
   * 1 to 6.</li>
   * <li>Any other name yields a {@code <p>} whose class is the style name with
   * spaces replaced by underscores and the first letter in lower case.</li>
   * </ul>
   *
   * @param styleName
   *          name of the paragraph style
   * @param isTable
   *          true if the paragraph is located inside a table
   * @return tag and style class to use, never null
   */
  public static TagAndStyle buildParagraphTagAndStyle(String styleName,
      boolean isTable) {
    if (styleName == null || styleName.length() == 0) {
      // No usable name: substring(0, 1) below would fail on it
      return defaultParagraphStyle;
    }

    TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
    if (tagAndStyle != null) {
      return tagAndStyle;
    }

    if (styleName.equals("Table Contents") && isTable) {
      return defaultParagraphStyle;
    }

    String tag = "p";
    String styleClass = null;

    if (styleName.startsWith("heading") || styleName.startsWith("Heading")) {
      // "Heading 3" or "Heading2" or "heading 4"
      // Take all trailing digits, so "Heading 10" is level 10 and not 0
      int digitsStart = styleName.length();
      while (digitsStart > 0
          && Character.isDigit(styleName.charAt(digitsStart - 1))) {
        digitsStart--;
      }
      int num = 1;
      if (digitsStart < styleName.length()) {
        try {
          num = Integer.parseInt(styleName.substring(digitsStart));
        } catch (NumberFormatException e) {
          // Only digits were passed in, so the number is too large for an
          // int: treat it as the deepest level
          num = 6;
        }
      }
      // Turn it into a H1 - H6 (H0 and H7+ aren't valid!)
      tag = "h" + Math.max(1, Math.min(num, 6));
    } else {
      styleClass = styleName.replace(' ', '_');
      // Locale.ROOT keeps the class name independent of the default locale
      // (e.g. no dotless i for a leading "I" under a Turkish locale)
      styleClass = styleClass.substring(0, 1).toLowerCase(java.util.Locale.ROOT)
          + styleClass.substring(1);
    }

    return new TagAndStyle(tag, styleClass);
  }

  /**
   * Immutable pair of an XHTML tag name and an optional class attribute
   * value.
   */
  public static class TagAndStyle {
    private final String tag;
    private final String styleClass;

    /**
     * @param tag
     *          XHTML tag name, e.g. "p" or "h1"
     * @param styleClass
     *          value for the class attribute, or null for none
     */
    public TagAndStyle(String tag, String styleClass) {
      this.styleClass = styleClass;
      this.tag = tag;
    }

    /** @return the XHTML tag name */
    public String getTag() {
      return this.tag;
    }

    /** @return the class attribute value, or null if there is none */
    public String getStyleClass() {
      return this.styleClass;
    }

    /**
     * @return true if the tag is two characters long and starts with "h"
     */
    public boolean isHeading() {
      if (tag.length() != 2) {
        return false;
      }
      return tag.startsWith("h");
    }
  }

  /**
   * Tells whether a character run takes part in the extraction: a missing
   * (null) run always does, any other run only if it is not marked as
   * deleted.
   *
   * @param cr
   *          character run, may be null
   * @return true if character run should be included in extraction.
   */
  private boolean isRendered(final CharacterRun cr) {
    if (cr == null) {
      return true;
    }
    return !cr.isMarkedDeleted();
  }

  /**
   * Provides access to the pictures both by offset, iteration over the
   * un-claimed, and peeking forward
   */
  private static class PicturesSource {
    private final PicturesTable picturesTable;
    // Pictures whose content has already been passed on as a resource
    private final Set<Picture> output = new HashSet<Picture>();
    // Picture start offset -> picture
    private final Map<Integer, Picture> lookup;
    // Copy of "all" in which every picture referenced by a character run of
    // the main range has been replaced by null
    private final List<Picture> nonU1based;
    private final List<Picture> all;
    // Next position in nonU1based to be examined by nextUnclaimed()
    private int pn = 0;

    private PicturesSource(HWPFDocument doc) {
      picturesTable = doc.getPicturesTable();
      all = picturesTable.getAllPictures();

      // Build the Offset-Picture lookup map
      lookup = new HashMap<Integer, Picture>();
      for (Picture p : all) {
        lookup.put(p.getStartOffset(), p);
      }

      // Work out which Pictures aren't referenced by
      // a U+0001 in the main text
      // These are U+0008 escher floating ones, ones
      // found outside the normal text, and who
      // knows what else...
      nonU1based = new ArrayList<Picture>(all);
      Range r = doc.getRange();
      for (int i = 0; i < r.numCharacterRuns(); i++) {
        CharacterRun cr = r.getCharacterRun(i);
        if (picturesTable.hasPicture(cr)) {
          Picture p = getFor(cr);
          // The picture may be unknown to the lookup, or may already have
          // been claimed by an earlier run referencing the same picture;
          // in both cases there is nothing left to clear
          int at = (p == null) ? -1 : nonU1based.indexOf(p);
          if (at >= 0) {
            nonU1based.set(at, null);
          }
        }
      }
    }

    /** @return true if the picture table has a picture for the run */
    private boolean hasPicture(CharacterRun cr) {
      return picturesTable.hasPicture(cr);
    }

    /** Remembers that the picture's content has been output. */
    private void recordOutput(Picture picture) {
      output.add(picture);
    }

    /** @return true if the picture was passed to recordOutput before */
    private boolean hasOutput(Picture picture) {
      return output.contains(picture);
    }

    /**
     * @return 1-based position of the picture in the list of all pictures,
     *         0 if it is not in that list
     */
    private int pictureNumber(Picture picture) {
      return all.indexOf(picture) + 1;
    }

    /**
     * @return the picture starting at the run's picture offset, or null if
     *         there is none
     */
    private Picture getFor(CharacterRun cr) {
      return lookup.get(cr.getPicOffset());
    }

    /**
     * Return the next unclaimed one, used towards the end
     *
     * @return the next picture not referenced from the main text, or null
     *         once all of them have been handed out
     */
    private Picture nextUnclaimed() {
      while (pn < nonU1based.size()) {
        Picture p = nonU1based.get(pn);
        pn++;
        if (p != null) {
          return p;
        }
      }
      return null;
    }
  }
}
